

##################################################################
##################################################################
## Replication Material
##
## Caramani, Gurova & Widmann: 
## The Evolution of Global Cleavages: A Historical Analysis of Territorial and Functional
## World Alignments Based on Automated Text Analysis, 1843−2020
## Comparative Political Studies
##
##
##################################################################
##################################################################

# Note: The file 000_readme.pdf describes all scripts and datasets required to replicate the analysis

# This script was run on the following R version, platform and OS:
# R version 4.3.1 (2021-03-31)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# Running under: Windows 10 x64 (build 19045)


sessionInfo()


#### Load Packages #####################################

install.packages("groundhog")
library("groundhog")

pkgs <- c("quanteda","LSX", "tidyverse", "plm", "lmtest", "openxlsx",
          "tidyr", "patchwork", "caret")
groundhog.library(pkgs, "2024-04-22")



#### Working directory ###################################

# Set your working directory to the downloaded replication folder
setwd("./files/")

#### Loading raw data #####################################
# NOTE: The raw data from The Economist cannot be shared publicly due to copyright reasons.
# Please refer to the 000_README.pdf file in the replication material for a detailed description
# of how to collect these data manually.


# raw Economist data
load("./df_raw_complete.Rdata")


#### Data cleaning ######################################################
# Copy the original dataframe to a new one for cleaning
df_clean <- df_raw_complete

# Copy the 'text' column to a new column 'sent.text' for further processing
df_clean$sent.text <- df_clean$text

# Remove instances of "- " in the 'sent.text' column
df_clean$sent.text <- gsub("- ", "", df_clean$sent.text)
# Remove instances of " -" in the 'sent.text' column
df_clean$sent.text <- gsub(" -", "", df_clean$sent.text)
# Remove parentheses "()" in the 'sent.text' column
df_clean$sent.text <- gsub("[()]", "", df_clean$sent.text)
# Remove instances of "| " in the 'sent.text' column
df_clean$sent.text <- gsub("\\| ", "", df_clean$sent.text)
# Remove instances of "|" in the 'sent.text' column
df_clean$sent.text <- gsub("\\|", "", df_clean$sent.text)
# Remove instances of "¬ " in the 'sent.text' column
df_clean$sent.text <- gsub("¬ ", "", df_clean$sent.text)
# Remove instances of " ¬" in the 'sent.text' column
df_clean$sent.text <- gsub(" ¬", "", df_clean$sent.text)
# Correct the spelling from "partiament" to "parliament" in the 'sent.text' column
df_clean$sent.text <- gsub("partiament", "parliament", df_clean$sent.text)
# Replace double spaces with a single space in the 'sent.text' column
df_clean$sent.text <- gsub("  ", " ", df_clean$sent.text)

# Remove rows where 'sent.text' is NA
df_clean <- df_clean[!is.na(df_clean$sent.text),]
# Remove rows where 'sent.text' is an empty string
df_clean <- df_clean[!df_clean$sent.text=="",]

# Count the number of terms (words) in the 'text' column
df_clean$terms <- str_count(df_clean$text, "\\s")
# Increment the term count by 1 to get the actual number of words
df_clean$terms <- df_clean$terms + 1

# Filter out rows where the number of terms is 150 or less
df_clean_removed150 <- df_clean[df_clean$terms > 150,]


#### Prepare Data ##############################

#Create corpus
corpus <- corpus(df_clean_removed150$sent.text, docvars = df_clean_removed150)
corp_sent <- corpus_reshape(corpus, to =  "sentences")

# tokenize text corpus and remove various features
toks_sent <- corp_sent %>% 
  tokens(remove_punct = TRUE, remove_symbols = TRUE, 
         remove_numbers = TRUE, remove_url = TRUE) %>% 
  tokens_remove(stopwords("en", source = "marimo"))


# create a document feature matrix from the tokens object
dfmat_sent <- toks_sent %>% 
  dfm() %>% 
  dfm_remove(pattern = "") %>% 
  dfm_trim(min_termfreq = 50, max_termfreq = 250000)

# set the seed words for functional and territorial
dic <- list(positive= NA, negative = NA)
dic$positive <- list("bloc*", "region*", "civilisation*", "countr*", "nation*")

dic$negative <- list("class*", "ideolog*", "inequalit*", "group*", "social*")


# define seed words
seed <- as.seedwords(dic)
print(seed)

#### Run main LSX analysis ###################

# run LSS model
tmod_lss <- textmodel_lss(dfmat_sent, seeds = seed,
                          k = 300, cache = TRUE)

class(tmod_lss)
# save LSS model
#save(tmod_lss, file = ".model.Rdata")


# load pretrained model
load("./model.Rdata")

dfmat_doc <- dfm_group(dfmat_sent)
dat <- docvars(dfmat_doc)

dat$year <- gsub("(\\d{4}).*", "\\1", dat$date)
dat$year <- gsub("(\\d{4})", "\\1-01-01", dat$year)
dat$year <- as.Date(dat$year, "%Y-%m-%d")

# predict scores for documents
dat$fit <- predict(tmod_lss, newdata = dfmat_doc)


#### Figure 3 ###########

# Textplots

# for illustrative purposes, we focus on the "main" seed words without using asterisks
dic2 <- list(positive= NA, negative = NA)
dic2$positive <- list("bloc", "region", "civilisation", "country", "nation")

dic2$negative <- list("class", "ideology", "inequality", "group", "social")

textplot_terms(tmod_lss, dic2)

# save plot
ggsave("./figures/figure3.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  

#### Table 2 ############

df_perf <- openxlsx::read.xlsx("./sample_human_coding.xlsx")

df_perf <- df_perf[!is.na(df_perf$hscore),]

df_perf$cscore <- 0
df_perf$cscore[df_perf$fit>0.2] <- 1
df_perf$cscore[df_perf$fit<(-0.2)] <- -1

df_perf$functional_h <- 0
df_perf$functional_h[df_perf$hscore==(-1)] <- 1
df_perf$functional_c <- 0
df_perf$functional_c[df_perf$cscore==(-1)] <- 1

df_perf$territorial_h <- 0
df_perf$territorial_h[df_perf$hscore==(1)] <- 1
df_perf$territorial_c <- 0
df_perf$territorial_c[df_perf$cscore==(1)] <- 1

df_perf$neutral_h <- 0
df_perf$neutral_h[df_perf$hscore==0] <- 1
df_perf$neutral_c <- 0
df_perf$neutral_c[df_perf$cscore==0] <- 1

performance_df_fun <- NULL
#Performance of FUNCTIONAL
results_fun <- caret::confusionMatrix(as.factor(df_perf$functional_c), as.factor(df_perf$functional_h), positive = "1")
results_fun$byClass
performance_df_fun$accuracy <- results_fun$overall[1]
performance_df_fun$precision <- results_fun$byClass[5]
performance_df_fun$recall <- results_fun$byClass[6]
performance_df_fun$f1 <- results_fun$byClass[7]
performance_df_fun <- as.data.frame(performance_df_fun)
performance_df_fun$cleavage <- "functional"

performance_df_ter <- NULL
# Performance of TERRITORIAL
results_ter <- caret::confusionMatrix(as.factor(df_perf$territorial_c), as.factor(df_perf$territorial_h), positive = "1")
results_ter$byClass
performance_df_ter$accuracy <- results_ter$overall[1]
performance_df_ter$precision <- results_ter$byClass[5]
performance_df_ter$recall <- results_ter$byClass[6]
performance_df_ter$f1 <- results_ter$byClass[7]
performance_df_ter <- as.data.frame(performance_df_ter)
performance_df_ter$cleavage <- "territorial"

performance_df <- rbind(performance_df_fun, performance_df_ter)
write.xlsx(performance_df, file = "./tables/table2.xlsx")


#### Figure 4 ###########


df2 <- aggregate(dat$fit, list(dat$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ylim(-2, 2) +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 
ggsave("./figures/figure4.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  

#### Figure 5 ############################################

df2 <- aggregate(dat$fit, list(dat$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") +
  geom_vline(xintercept= as.Date("1853-01-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1853-01-01"), label="Crimean War", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1859-01-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1859-01-01"), label="Italian War of Independence", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1897-04-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1897-04-01"), label="Ottoman-Greek War", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1914-01-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1914-01-01"), label="World War 1", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1939-01-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1939-01-01"), label="World War 2", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1973-01-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1973-01-01"), label="Yom Kippur War", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7) +
  geom_vline(xintercept= as.Date("1990-08-01"), linetype = "dashed", colour = "grey40") +
  geom_text(aes(x=as.Date("1990-08-01"), label="Gulf War", y=-.5), colour="grey40", angle=90, vjust = 1, size = 7)

ggsave("./figures/figure5.png", width = 20, height = 16, dpi = 300, units = "in", device='png')  

#### Figure 6 ##########################################

# creating our own topic dictionaries
topic_dictionary <- dictionary(list(
  military=c("war","militar*", "weapon*", "tank*", "defense", "conflict"),
  economy=c("econom*", "income", "development", "resource*", "trade*", "GDP", "rich", "poor"),
  politics=c("power", "government*", "voting", "influence", "rights", "freedom", "politic*"),
  social=c("social", "education", "health", "mortality", "gender", "wom?n", "literacy", "*migra*"),
  cultural=c("cultur*", "religion*", "*sexuality", "ethnic*", "minorit*", "*tolerance"),
  sport=c("sport*", "*ball*", "score*", "team*", "player*")
))


topic_dictionary

# Corpus
corp_da <- corpus(dat)

# tokenize texts
toks_da <- tokens(corp_da, remove_punct = TRUE)

# create a document-feature matrix
dfmat_da <- dfm(toks_da)

# And we can apply the dictionary to our data
topics <- dfm_lookup(dfmat_da, topic_dictionary)

topic_df <- cbind(dat, convert(topics, to = 'data.frame'))


##### Military #########
military_df <- topic_df[topic_df$military>=3,]


military_df$year <- gsub("(\\d{4}).*", "\\1", military_df$date)
military_df$year <- gsub("(\\d{4})", "\\1-01-01", military_df$year)
military_df$year <- as.Date(military_df$year, "%Y-%m-%d")

df2 <- aggregate(military_df$fit, list(military_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p1 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +  
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  ggtitle("Military") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 


##### sport #########
sport_df <- topic_df[topic_df$sport>=3,]


sport_df$year <- gsub("(\\d{4}).*", "\\1", sport_df$date)
sport_df$year <- gsub("(\\d{4})", "\\1-01-01", sport_df$year)
sport_df$year <- as.Date(sport_df$year, "%Y-%m-%d")

df2 <- aggregate(sport_df$fit, list(sport_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p2 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Sports") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### politics #########
politics_df <- topic_df[topic_df$politics>=3,]


politics_df$year <- gsub("(\\d{4}).*", "\\1", politics_df$date)
politics_df$year <- gsub("(\\d{4})", "\\1-01-01", politics_df$year)
politics_df$year <- as.Date(politics_df$year, "%Y-%m-%d")

df2 <- aggregate(politics_df$fit, list(politics_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p3 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Politics") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 



##### economy #########
economy_df <- topic_df[topic_df$economy>=3,]


economy_df$year <- gsub("(\\d{4}).*", "\\1", economy_df$date)
economy_df$year <- gsub("(\\d{4})", "\\1-01-01", economy_df$year)
economy_df$year <- as.Date(economy_df$year, "%Y-%m-%d")

df2 <- aggregate(economy_df$fit, list(economy_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p4 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Economy") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 


##### social #########
social_df <- topic_df[topic_df$social>=3,]


social_df$year <- gsub("(\\d{4}).*", "\\1", social_df$date)
social_df$year <- gsub("(\\d{4})", "\\1-01-01", social_df$year)
social_df$year <- as.Date(social_df$year, "%Y-%m-%d")

df2 <- aggregate(social_df$fit, list(social_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p5 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Social") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### cultural #########
cultural_df <- topic_df[topic_df$cultural>=3,]


cultural_df$year <- gsub("(\\d{4}).*", "\\1", cultural_df$date)
cultural_df$year <- gsub("(\\d{4})", "\\1-01-01", cultural_df$year)
cultural_df$year <- as.Date(cultural_df$year, "%Y-%m-%d")

df2 <- aggregate(cultural_df$fit, list(cultural_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p6 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Culture") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### Combining plots ########
combined_plot <- (p4 | p1) / (p6 | p5) / (p3 | p2)
combined_plot
ggsave("./figures/figure6.png", width = 15, height = 20, dpi = 300, units = "in", device='png')  


#### Figure A1 ##############

# Histogram
ggplot(dat, aes(x=fit)) +
  geom_histogram(alpha = .6, colour="grey", fill = "grey") +
  geom_vline(data=dat, aes(xintercept=mean(fit)),
             linetype="dashed", size=1) +
  theme_bw()
ggsave("./figures/figureA1.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  

#### Table A1 ##############

# Assuming `coef(tmod_lss)` returns a named numeric vector of word coefficients
coefficients <- coef(tmod_lss)

# Extract the most positive words
most_positive <- head(coefficients, 20)

# Extract the most negative words
most_negative <- tail(coefficients, 20)

# Create a data frame combining the positive and negative words
words_df <- data.frame(
  Word = c(names(most_positive), names(most_negative)),
  Coefficient = c(most_positive, most_negative),
  Sentiment = c(rep("Positive", length(most_positive)), rep("Negative", length(most_negative)))
)

write.xlsx(words_df, file = "./tables/tableA1.xlsx")

#### Figure A2 #############

# First, we start with replacing one by one seed words on the territorial end of the scale
dat2 <- dat

dat2$year <- gsub("(\\d{4}).*", "\\1", dat2$date)
dat2$year <- gsub("(\\d{4})", "\\1-01-01", dat2$year)
dat2$year <- as.Date(dat2$year, "%Y-%m-%d")

dat2 <- aggregate(dat2$fit, list(dat2$year), mean, na.rm=T)
colnames(dat2)[1] <- "date"
colnames(dat2)[2] <- "fit"


# Initialize a list to store the plots
plot_list <- list()

for (i in 1:5){
  
  # 5 words
  dic <- list(positive = NA, negative = NA)
  pos <- c("bloc*", "region*", "civilisation*", "countr*", "nation*")
  neg <- c("class*", "ideolog*", "inequalit*", "group*", "social*")
  
  j <- pos[i]
  j <- gsub("\\*", "", j)
  pos <- pos[-i]
  
  dic$positive <- list(pos)
  dic$negative <- list(neg)
  
  seed <- as.seedwords(dic)
  
  # Run LSS model
  tmod_lss <- textmodel_lss(dfmat_sent, seeds = seed, k = 300, cache = TRUE)
  
  dfmat_doc <- dfm_group(dfmat_sent)
  dat_temp <- docvars(dfmat_doc)
  dat_temp$fit <- predict(tmod_lss, newdata = dfmat_doc)
  
  # Aggregated by YEAR 
  dat_temp$year <- gsub("(\\d{4}).*", "\\1", dat_temp$date)
  dat_temp$year <- gsub("(\\d{4})", "\\1-01-01", dat_temp$year)
  dat_temp$year <- as.Date(dat_temp$year, "%Y-%m-%d")
  
  dat_temp <- aggregate(dat_temp$fit, list(dat_temp$year), mean, na.rm = TRUE)
  colnames(dat_temp)[1] <- "date"
  colnames(dat_temp)[2] <- "fit"
  
  dat_temp$fit2 <- dat2$fit
  
  dat3 <- dat_temp %>% 
    pivot_longer(!date, names_to = "fit", values_to = "value")
  
  plot <- ggplot(dat3, aes(x = date, y = value, colour = fit)) +
    geom_smooth(alpha = 0.5) +
    stat_smooth(alpha = 0.2) +
    geom_line(alpha = 0.5) +
    theme_bw() +
    ggtitle(j) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_color_manual(name = "Fit", labels = c("missing word", "original"),
                       values = c("#00AFBB", "#FC4E07")) +
    scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                    "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                    "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                    "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                    "2010-01-01", "2020-01-01")), 
                 minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                     to = max(x), by = "5 years"), 
                 date_labels = "%Y") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  
  # Add the plot to the list
  plot_list[[i]] <- plot
  print(i)
}

# Combine all plots into one using patchwork
figureA2_1 <- wrap_plots(plot_list, ncol = 2, nrow = 3)
figureA2_1

# Save the combined plot
ggsave("./figures/figureA2_1.png", figureA2_1, width = 10, height = 12, dpi = 300, units = "in", device = 'png')

# Now, we replace one by one seed words on the functional end of the scale
dat2 <- dat


dat2$year <- gsub("(\\d{4}).*", "\\1", dat2$date)
dat2$year <- gsub("(\\d{4})", "\\1-01-01", dat2$year)
dat2$year <- as.Date(dat2$year, "%Y-%m-%d")

dat2 <- aggregate(dat2$fit, list(dat2$year), mean, na.rm=T)
colnames(dat2)[1] <- "date"
colnames(dat2)[2] <- "fit"


# Initialize a list to store the plots
plot_list <- list()

for (i in 1:5){
  
  # 5 words
  dic <- list(positive = NA, negative = NA)
  pos <- c("bloc*", "region*", "civilisation*", "countr*", "nation*")
  neg <- c("class*", "ideolog*", "inequalit*", "group*", "social*")
  
  j <- neg[i]
  j <- gsub("\\*", "", j)
  neg <- neg[-i]
  
  dic$positive <- list(pos)
  dic$negative <- list(neg)
  
  seed <- as.seedwords(dic)
  
  # Run LSS model
  tmod_lss <- textmodel_lss(dfmat_sent, seeds = seed, k = 300, cache = TRUE)
  
  dfmat_doc <- dfm_group(dfmat_sent)
  dat_temp <- docvars(dfmat_doc)
  dat_temp$fit <- predict(tmod_lss, newdata = dfmat_doc)
  
  # Aggregated by YEAR 
  dat_temp$year <- gsub("(\\d{4}).*", "\\1", dat_temp$date)
  dat_temp$year <- gsub("(\\d{4})", "\\1-01-01", dat_temp$year)
  dat_temp$year <- as.Date(dat_temp$year, "%Y-%m-%d")
  
  dat_temp <- aggregate(dat_temp$fit, list(dat_temp$year), mean, na.rm = TRUE)
  colnames(dat_temp)[1] <- "date"
  colnames(dat_temp)[2] <- "fit"
  
  dat_temp$fit2 <- dat2$fit
  
  dat3 <- dat_temp %>% 
    pivot_longer(!date, names_to = "fit", values_to = "value")
  
  plot <- ggplot(dat3, aes(x = date, y = value, colour = fit)) +
    geom_smooth(alpha = 0.5) +
    stat_smooth(alpha = 0.2) +
    geom_line(alpha = 0.5) +
    theme_bw() +
    ggtitle(j) +
    geom_hline(yintercept = 0, linetype = "dashed") +
    scale_color_manual(name = "Fit", labels = c("missing word", "original"),
                       values = c("#00AFBB", "#FC4E07")) +
    scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                    "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                    "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                    "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                    "2010-01-01", "2020-01-01")), 
                 minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                     to = max(x), by = "5 years"), 
                 date_labels = "%Y") +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
  
  # Add the plot to the list
  plot_list[[i]] <- plot
  print(i)
}

# Combine all plots into one using patchwork
figureA2_2 <- wrap_plots(plot_list, ncol = 2, nrow = 3)
figureA2_2

# Save the combined plot
ggsave("./figures/figureA2_2.png", figureA2_2, width = 10, height = 12, dpi = 300, units = "in", device = 'png')


#### Table A2 #####################
dat_original <- dat

# Initialize the data frame with the correct number of rows
df_cor <- data.frame(correlation = rep(NA, 5), word = rep(NA, 5))

for (i in 1:5){
  
  # 5 words
  dic <- list(positive = NA, negative = NA)
  pos <- c("bloc*", "region*", "civilisation*", "countr*", "nation*")
  neg <- c("class*", "ideolog*", "inequalit*", "group*", "social*")
  
  j <- pos[i]
  j <- gsub("\\*", "", j)
  pos <- pos[-i]
  
  dic$positive <- list(pos)
  dic$negative <- list(neg)
  
  seed <- as.seedwords(dic)
  
  # Run LSS model
  tmod_lss <- textmodel_lss(dfmat_sent, seeds = seed, k = 300, cache = TRUE)
  
  dfmat_doc <- dfm_group(dfmat_sent)
  dat_temp <- docvars(dfmat_doc)
  dat_temp$fit <- predict(tmod_lss, newdata = dfmat_doc)
  cor <- cor(dat_original$fit, dat_temp$fit)
  
  df_cor$correlation[i] <- cor
  df_cor$word[i] <- j
  
  print(i)
}


# Initialize the data frame with the correct number of rows
df_cor2 <- data.frame(correlation = rep(NA, 5), word = rep(NA, 5))

for (i in 1:5){
  
  # 5 words
  dic <- list(positive = NA, negative = NA)
  pos <- c("bloc*", "region*", "civilisation*", "countr*", "nation*")
  neg <- c("class*", "ideolog*", "inequalit*", "group*", "social*")
  
  j <- neg[i]
  j <- gsub("\\*", "", j)
  neg <- neg[-i]
  
  dic$positive <- list(pos)
  dic$negative <- list(neg)
  
  seed <- as.seedwords(dic)
  
  # Run LSS model
  tmod_lss <- textmodel_lss(dfmat_sent, seeds = seed, k = 300, cache = TRUE)
  
  dfmat_doc <- dfm_group(dfmat_sent)
  dat_temp <- docvars(dfmat_doc)
  dat_temp$fit <- predict(tmod_lss, newdata = dfmat_doc)
  cor <- cor(dat_original$fit, dat_temp$fit)
  
  df_cor2$correlation[i] <- cor
  df_cor2$word[i] <- j
  
  print(i)
}

df_correlation <- rbind(df_cor, df_cor2)

# Calculate the mean correlation
mean_correlation <- mean(df_correlation$correlation, na.rm = TRUE)

# Create a new row for the average correlation
average_row <- data.frame(correlation = mean_correlation, word = "Average")

# Add the new row to the original data frame
df_correlation <- rbind(df_correlation, average_row)

# Round all numeric values to two decimal places
df_correlation$correlation <- round(df_correlation$correlation, 2)

# Save the data frame to an Excel file
write.xlsx(df_correlation, file = "./tables/tableA2.xlsx", rowNames = FALSE)

#### OA Figure 1 ##############################

word_list2 <- c("united kingdom", "uk", "england", "scotland", "northern ireland", "wales", "great britain")
dat$sent.text2 <- tolower(dat$sent.text)
dat$domestic_count <- 0
dat$domestic_count <- str_count(dat$sent.text2, paste(word_list2, collapse = "|")) # Count stopwords in sentence

dat$domestic_binary <- 0
dat$domestic_binary[grepl(paste(word_list2, collapse = "|"), dat$sent.text2)] <- 1 # keep only lines with terms


dat$domestic_binary2 <- 0
dat$domestic_binary2[dat$domestic_count>=3] <- 1

df2 <- aggregate(dat$domestic_count, list(dat$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Proportion of Domestic Articles") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

ggsave("./figures/oa_figure1.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  

#### OA Figure 2 ######################################

df2 <- aggregate(dat$domestic_binary, list(dat$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Proportion of Domestic Articles") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

ggsave("./figures/oa_figure2.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  


#### OA Figure 2 ######################################

df2 <- aggregate(dat$domestic_binary2, list(dat$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Proportion of Domestic Articles") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

ggsave("./figures/oa_figure3.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  



#### OA Figure 4 ########################################

df_temp <- dat[dat$domestic_count<1,]

df_temp2 <- aggregate(df_temp$fit, list(df_temp$year), mean, na.rm=T)
colnames(df_temp2)[1] <- "date"

p1 <- ggplot(df_temp2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ylim(-2, 2) +
  ggtitle("Excluding articles with 1 domestic word") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 



df_temp <- dat[dat$domestic_count<3,]

df_temp2 <- aggregate(df_temp$fit, list(df_temp$year), mean, na.rm=T)
colnames(df_temp2)[1] <- "date"

p2 <- ggplot(df_temp2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ylim(-2, 2) +
  ggtitle("Excluding articles with 3 domestic words") +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

combined_plot2 <- (p1 | p2)
combined_plot2
ggsave("./figures/oa_figure4.png", width = 20, height = 15, dpi = 300, units = "in", device='png')  



#### OA Figure 5 ##########################

# create a document feature matrix from the tokens object
dfmat_sent <- toks_sent %>% 
  dfm() %>% 
  dfm_remove(pattern = "") %>% 
  dfm_trim(min_termfreq = 50, max_termfreq = 250000)

# set the seed words for functional and territorial
# 5 words
dic <- list(positive= NA, negative = NA)
dic$positive <- list("bloc*", "region*", "civilisation*", "countr*", "nation*", "colony", "coloni")

dic$negative <- list("class*", "ideolog*", "inequalit*", "group*", "social*")


# define seed words
seed <- as.seedwords(dic)
print(seed)

# run LSS model
#tmod_lss_colony <- textmodel_lss(dfmat_sent, seeds = seed,
                          #k = 300, cache = TRUE)


# save LSS model
#save(tmod_lss_colony, file = "./model_colony.Rdata")


# load pretrained model
load("./model_colony.Rdata")
class(tmod_lss_colony)

dfmat_doc <- dfm_group(dfmat_sent)
dat_colony <- docvars(dfmat_doc)

dat_colony$year <- gsub("(\\d{4}).*", "\\1", dat_colony$date)
dat_colony$year <- gsub("(\\d{4})", "\\1-01-01", dat_colony$year)
dat_colony$year <- as.Date(dat_colony$year, "%Y-%m-%d")

# predict scores for documents
dat_colony$fit <- predict(tmod_lss_colony, newdata = dfmat_doc)

df2 <- aggregate(dat_colony$fit, list(dat_colony$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ylim(-2, 2) +
  geom_hline(yintercept = 0, linetype="dashed") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 
ggsave("./figures/oa_figure5.png", width = 10, height = 8, dpi = 300, units = "in", device='png')  



#### OA Figure 6 ###########################


# Corpus
corp_da <- corpus(dat_colony)

# tokenize texts
toks_da <- tokens(corp_da, remove_punct = TRUE)

# create a document-feature matrix
dfmat_da <- dfm(toks_da)

# And we can apply the dictionary to our data
topics <- dfm_lookup(dfmat_da, topic_dictionary)

topic_df <- cbind(dat_colony, convert(topics, to = 'data.frame'))


##### Military #########
military_df <- topic_df[topic_df$military>=3,]


military_df$year <- gsub("(\\d{4}).*", "\\1", military_df$date)
military_df$year <- gsub("(\\d{4})", "\\1-01-01", military_df$year)
military_df$year <- as.Date(military_df$year, "%Y-%m-%d")

df2 <- aggregate(military_df$fit, list(military_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p1 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +  
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  ggtitle("Military") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 


##### sport #########
sport_df <- topic_df[topic_df$sport>=3,]


sport_df$year <- gsub("(\\d{4}).*", "\\1", sport_df$date)
sport_df$year <- gsub("(\\d{4})", "\\1-01-01", sport_df$year)
sport_df$year <- as.Date(sport_df$year, "%Y-%m-%d")

df2 <- aggregate(sport_df$fit, list(sport_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p2 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Sports") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### politics #########
politics_df <- topic_df[topic_df$politics>=3,]


politics_df$year <- gsub("(\\d{4}).*", "\\1", politics_df$date)
politics_df$year <- gsub("(\\d{4})", "\\1-01-01", politics_df$year)
politics_df$year <- as.Date(politics_df$year, "%Y-%m-%d")

df2 <- aggregate(politics_df$fit, list(politics_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p3 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Politics") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 



##### economy #########
economy_df <- topic_df[topic_df$economy>=3,]


economy_df$year <- gsub("(\\d{4}).*", "\\1", economy_df$date)
economy_df$year <- gsub("(\\d{4})", "\\1-01-01", economy_df$year)
economy_df$year <- as.Date(economy_df$year, "%Y-%m-%d")

df2 <- aggregate(economy_df$fit, list(economy_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p4 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Economy") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 


##### social #########
social_df <- topic_df[topic_df$social>=3,]


social_df$year <- gsub("(\\d{4}).*", "\\1", social_df$date)
social_df$year <- gsub("(\\d{4})", "\\1-01-01", social_df$year)
social_df$year <- as.Date(social_df$year, "%Y-%m-%d")

df2 <- aggregate(social_df$fit, list(social_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p5 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Social") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### cultural #########
cultural_df <- topic_df[topic_df$cultural>=3,]


cultural_df$year <- gsub("(\\d{4}).*", "\\1", cultural_df$date)
cultural_df$year <- gsub("(\\d{4})", "\\1-01-01", cultural_df$year)
cultural_df$year <- as.Date(cultural_df$year, "%Y-%m-%d")

df2 <- aggregate(cultural_df$fit, list(cultural_df$year), mean, na.rm=T)
colnames(df2)[1] <- "date"

p6 <- ggplot(df2, aes(x=date, y=x, group = 1)) +
  geom_line() +
  geom_smooth() +
  theme_bw() +
  geom_hline(yintercept = 0, linetype="dashed") +
  ylim(-2, 2) +
  theme(text = element_text(size = 30), axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=0.5)) +
  xlab("Date") + ylab("Functional - Territorial") +
  ggtitle("Culture") +
  scale_x_date(breaks = as.Date(c("1850-01-01", "1860-01-01", "1870-01-01", "1880-01-01",
                                  "1890-01-01", "1900-01-01", "1910-01-01", "1920-01-01",
                                  "1930-01-01", "1940-01-01", "1950-01-01", "1960-01-01",
                                  "1970-01-01", "1980-01-01", "1990-01-01", "2000-01-01",
                                  "2010-01-01", "2020-01-01")), minor_breaks = function(x) seq.Date(from = as.Date("1850-01-01"), 
                                                                                                    to = max(x), 
                                                                                                    by = "5 years"), date_labels = "%Y") 

##### Combining plots ########
combined_plot <- (p4 | p1) / (p6 | p5) / (p3 | p2)
combined_plot
ggsave("./figures/oa_figure6.png", width = 15, height = 20, dpi = 300, units = "in", device='png')  








